The data are from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE156123 - Analysis of cerebrovascular dysfunction caused by chronic social defeat in mice. Overall design: wild-type mice were exposed to stressor for 10 min every day for 1 day, 7 days and 14 days. Three to Four animals per group were studied, for a total of 19 animals.
library(oligo)
library(stringr)
library(tidyr)
library(glue)
library(limma)
library(ReportingTools)
library(lattice)
library(sva)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(janitor)
library(affycoretools)
library(pd.mta.1.0)
library(clusterProfiler)
library(gplots)
source("library/age_library.R")
base <- 'GSE156123_RAW/celfiles/'
celfiles <- list.files(base, full = TRUE)
raw_data <- read.celfiles(celfiles)
## Reading in : GSE156123_RAW/celfiles//GSM4725016_1_ML02_HC_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725017_2_ML03_HC_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725018_6_ML08_HC_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725019_5_ML07_HC_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725020_4_ML06_1dSD-10min_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725021_19_ML22_1dSD-10min_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725022_18_ML21_1dSD-10min_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725023_25_ML29_7dSD-10min_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725024_17_ML20_7dSD-10min_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725025_16_ML19_7dSD-10min_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725026_9_ML12_7dSD-10min_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725027_15_ML18_14dSD-10min_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725028_14_ML17_14dSD-10min_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725029_13_ML16_14dSD-10min_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725030_12_ML15_14dSD-10min_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725031_31_ML53_14dSD-7dREC_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725032_30_ML52_14dSD-7dREC_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725033_29_ML51_14dSD-7dREC_MTA-1_0_.CEL
## Reading in : GSE156123_RAW/celfiles//GSM4725034_28_ML50_14dSD-7dREC_MTA-1_0_.CEL
pheno_data <- readr::read_delim("GSE156123_RAW/sheet_file.csv", delim = ",") %>%
dplyr::select(2, 4, 7) %>%
magrittr::set_colnames(c("sample_name", "sample_group", "group_type")) %>%
dplyr::mutate(
cel_file = celfiles,
sample_group = str_replace_all(sample_group, " ", "_"),
sample_group = str_to_lower(sample_group) %>% factor() %>% relevel("home_caged")) %>%
dplyr::select(sample_name, sample_group, group_type, cel_file)
## Parsed with column specification:
## cols(
## Group = col_character(),
## Accession = col_character(),
## Title = col_character(),
## `Source name` = col_character(),
## Strain = col_character(),
## Genotype = col_character(),
## Diet = col_character(),
## `Tissue/cell type` = col_character()
## )
row.names(pheno_data) <- pheno_data$sample_name
## Warning: Setting row names on a tibble is deprecated.
sampleNames(raw_data) <- pheno_data$sample_name
metadata <- data.frame(
labelName = colnames(pheno_data),
labelDescription = c("Name", "Used factor", "Group", "File"),
stringsAsFactors = FALSE
)
pheno_data <- AnnotatedDataFrame(data = pheno_data, varMetadata = metadata)
phenoData(raw_data) <- Biobase::combine(phenoData(raw_data), pheno_data)
pData(raw_data)
pheno_data
## An object of class 'AnnotatedDataFrame'
## rowNames: 1 2 ... 3 (19 total)
## varLabels: sample_name sample_group group_type cel_file
## varMetadata: labelName labelDescription
image(raw_data, which=17, transfo = log2)
MAplot(raw_data, groups = pData(raw_data)$sample_group, pairs = TRUE)
boxplot(raw_data[1:10000,], "all", las=3)
fit_plm <- fitProbeLevelModel(raw_data)
## Background correcting... OK
## Normalizing... OK
## Summarizing... OK
## Extracting...
## Estimates... OK
## StdErrors... OK
## Weights..... OK
## Residuals... OK
## Scale....... OK
image(fit_plm, which=17)
image(fit_plm, which = 17, type = "sign.residuals")
RLE(fit_plm)
NUSE(fit_plm)
library(pd.mta.1.0)
pd.mta.1.0
## Class........: AffyHTAPDInfo
## Manufacturer.:
## Genome Build.: hg19
## Chip Geometry: 2572 rows x 2680 columns
#Normalization
I assumed that first two samples could be low quality one, so I decited take the rest for continios work.
norm_data <- rma(raw_data[,3:19])
## Background correcting
## Normalizing
## Calculating Expression
feature_data <- annotateEset(norm_data, pd.mta.1.0, columns = c("PROBEID", "ENTREZID", "SYMBOL", "GENENAME"), type = 'core')
#tail(fData(feature_data))
groups <- as.factor(pData(norm_data)$sample_group)
names(groups) <- sampleNames(norm_data)
plot_hc(exprs(norm_data), color_by = norm_data$sample_group,
color_by_lab = "Sample group")
pca_plot(norm_data)
heatmap.2(exprs(norm_data)[1:1000,],col=redgreen(100), key=FALSE, scale="row", density.info="none", trace="none",cexRow=0.25, cexCol=0.7 )
group <- pData(norm_data)$sample_group %>% factor() %>% relevel("home_caged")
dea_model <- model.matrix(~ group)
colnames(dea_model)[1] <- "Intercept"
dea_model
## Intercept groupstressed_for_1_day groupstressed_for_14_days
## 1 1 0 0
## 2 1 0 0
## 3 1 1 0
## 4 1 1 0
## 5 1 1 0
## 6 1 0 0
## 7 1 0 0
## 8 1 0 0
## 9 1 0 0
## 10 1 0 1
## 11 1 0 1
## 12 1 0 1
## 13 1 0 1
## 14 1 0 0
## 15 1 0 0
## 16 1 0 0
## 17 1 0 0
## groupstressed_for_14_days_+_recreation_for_7_days groupstressed_for_7_days
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 1
## 7 0 1
## 8 0 1
## 9 0 1
## 10 0 0
## 11 0 0
## 12 0 0
## 13 0 0
## 14 1 0
## 15 1 0
## 16 1 0
## 17 1 0
## attr(,"assign")
## [1] 0 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
fit <- lmFit(norm_data, dea_model) %>% eBayes()
colnames(fit)
## [1] "Intercept"
## [2] "groupstressed_for_1_day"
## [3] "groupstressed_for_14_days"
## [4] "groupstressed_for_14_days_+_recreation_for_7_days"
## [5] "groupstressed_for_7_days"
topTable(fit, coef = 5)
output_groups <- levels(groups)
output_groups <- output_groups[output_groups != "home_cage"]
rep_theme <- reporting.theme()
lattice.options(default.theme = rep_theme)
REPORT_DIR <- "html_report"
REPORT_N_GENES <- 50
REPORT_LFC_THRESHOLD <- 1
REPORT_P_VALUE_THRESHOLD <- 0.1
REPORT_P_VALUE_ADJUST_METHOD <- "fdr"
for (group in output_groups) {
cat(glue("\nDEA for group '{group}': ", .trim = FALSE))
tryCatch(
{
# This is an object (and a file) to which we are publishing.
de_report <- HTMLReport(
shortName = group,
title = glue("{group} vs. control"),
reportDirectory = REPORT_DIR
)
# We can control what to take from topTable.
# Here, we want top REPORT_N_GENES DE genes (ranked by p-value adjusted by FDR) with minimal LFC of 1 (i.e. two-fold up or down).
# We won't be much conservative and set adjusted p-value cutoff to 0.1. That means we expect 10% of our DE genes to be false positives.
publish(
fit,
de_report,
eSet = norm_data,
factor = groups,
coef = glue("group{group}"),
n = REPORT_N_GENES,#variable
lfc = REPORT_LFC_THRESHOLD, #1
pvalueCutoff = REPORT_P_VALUE_THRESHOLD, #0.1
adjust.method = REPORT_P_VALUE_ADJUST_METHOD
)
finish(de_report)
cat(glue("<a href='{DE_REPORT_DIR}/{group}.html' target='_blank'>report</a>"))
},
error = function(e) {
cat(e$message)
})
}
##
## DEA for group 'home_caged': subscript out of bounds
## DEA for group 'stressed_for_1_day':
## object 'DE_REPORT_DIR' not found
## DEA for group 'stressed_for_14_days':
## object 'DE_REPORT_DIR' not found
## DEA for group 'stressed_for_14_days_+_recreation_for_7_days':
## object 'DE_REPORT_DIR' not found
## DEA for group 'stressed_for_7_days':
## object 'DE_REPORT_DIR' not found
There’s we have wery interesting result. From the first day under stress there’s underexpressed gene for asporin, that trend is maintained till the end of experement. On the second week there’s up to 9 differently expressed genes.
data_long <- exprs(feature_data)[10000:10010, ] %>%
as.data.frame() %>%
tibble::rownames_to_column("PROBEID") %>%
tidyr::pivot_longer(-PROBEID, names_to = "sample_name", values_to = "E") %>%
dplyr::left_join(fData(feature_data), by = "PROBEID") %>%
dplyr::left_join(pData(feature_data), by = "sample_name")
tail(data_long)
#GSEA
For Gene Set Enrichment Analysis I decited to take the most different group from control one.
toGSEA <- topTable(fit, coef = "groupstressed_for_14_days", number=72000)
featureDataGSEA <- dplyr::left_join(cbind(PROBEID = rownames(toGSEA), toGSEA), fData(feature_data), by = "PROBEID", copy=TRUE)
entrez_ids <- bitr(featureDataGSEA$SYMBOL, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Mm.eg.db")
## Loading required package: org.Mm.eg.db
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:clusterProfiler':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
##
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(featureDataGSEA$SYMBOL, fromType = "SYMBOL", toType =
## "ENTREZID", : 24.84% of input gene IDs are fail to map...
entrez_ids <- entrez_ids[!duplicated(entrez_ids$ENTREZID), ]
res_dex_shrink <- dplyr::left_join(entrez_ids, featureDataGSEA, by = "SYMBOL", copy=TRUE)
entrez_wald <- res_dex_shrink$logFC
names(entrez_wald) <- res_dex_shrink$ENTREZID
# Sort by decreasing Wald statistic.
entrez_wald <- entrez_wald[order(entrez_wald, decreasing = TRUE)]
#gseKEGG(entrez_wald)
gsea_kegg_results <- gseKEGG(
geneList = entrez_wald,
# KEGG organism ID
organism = "mmu",
# Key type is ENTREZ ID.
keyType = "ncbi-geneid",
# Run 10 000 permutation tests for each pathway.
nPerm = 10000,
# Correct p-values for FDR.
pAdjustMethod = "fdr",
# FDR adjusted p-value threshold.
# We are OK with 10% of false positives among all pathways called significant.
pvalueCutoff = 0.1,
verbose = TRUE
)
## Reading KEGG annotation online:
##
## Reading KEGG annotation online:
##
## Reading KEGG annotation online:
## preparing geneSet collections...
## GSEA analysis...
## Warning in .GSEA(geneList = geneList, exponent = exponent, minGSSize =
## minGSSize, : We do not recommend using nPerm parameter incurrent and future
## releases
## Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize
## = minGSSize, : You are trying to run fgseaSimple. It is recommended to use
## fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument
## in the fgsea function call.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.07% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## leading edge analysis...
## done...
as.data.frame(gsea_kegg_results)
dotplot(gsea_kegg_results, showCategory = 15, x = "GeneRatio", font.size = 10)